package com.tledu.spring.core.tools;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Crawler utility class: fetches pages via WebCollector's BreadthCrawler
 * and extracts article body text for later tokenization.
 *
 * @author Lsj
 * @version 1.0
 * @date 2021/7/8 14:40
 */
public class DemoAutoNewsCrawler extends BreadthCrawler {
    /** Text extracted from the most recently visited page (written by {@link #visit}). */
    public static String toHtml;
    /** Never assigned anywhere in this class; kept only for binary compatibility. */
    public static String totoHtml;

    /**
     * Creates a crawler seeded with a single start URL.
     *
     * @param crawlPath directory where WebCollector persists crawl state
     * @param autoParse whether WebCollector should automatically extract links
     * @param url       seed URL the crawl starts from
     */
    public DemoAutoNewsCrawler(String crawlPath, boolean autoParse, String url) {
        super(crawlPath, autoParse);
        // Could also add list-page seeds and regex filters here.
        this.addSeed(url);
        // Crawl with up to 50 worker threads.
        setThreads(50);
        // Keep at most the top 100 links per page.
        getConf().setTopN(100);
    }

    /**
     * Callback invoked for every fetched page; stores the article body text
     * selected by the CSS selector into {@link #toHtml}.
     *
     * @param page        the fetched page
     * @param crawlDatums queue for scheduling follow-up URLs (unused here)
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        // NOTE(review): dumping full page HTML is debug output; swap for a logger in production.
        System.out.println(page.html());
        toHtml = page.selectText("div.read-content.j_readContent");
    }

    /**
     * Returns the raw text extracted from the last visited page.
     *
     * <p>Bug fix: previously returned {@code totoHtml}, which is never assigned
     * and therefore was always {@code null}; the intended field is {@code toHtml}.
     *
     * @return extracted text, or {@code null} if no page has been visited yet
     */
    public String returnHHtml() {
        return toHtml;
    }

    /**
     * Splits the last extracted text on whitespace runs.
     *
     * @return mutable list of tokens; empty list if nothing has been crawled yet
     *         (previously this threw a NullPointerException in that case)
     */
    public List<String> returnHtml() {
        if (toHtml == null || toHtml.isEmpty()) {
            return new ArrayList<>();
        }
        String[] tokens = toHtml.split("\\s+");
        return new ArrayList<>(Arrays.asList(tokens));
    }

    public static void main(String[] args) {
        // Intentionally empty; construct the crawler and call start(depth) here to run it.
    }
}
